(in-package :sb-simd-avx)

(defun simd-dot (array1 array2 &aux (n (min (array-total-size array1) (array-total-size array2))))
  "Return the dot product of the double-float vectors ARRAY1 and ARRAY2.

Only the first N elements of each vector contribute, where N is the smaller
of the two total sizes.  Elements are consumed sixteen at a time through four
independent f64.4 accumulators; whatever does not fill a complete block of
sixteen is added by a scalar loop afterwards.

The function is compiled with (SAFETY 0): both arguments are declared to be
of type (SIMPLE-ARRAY DOUBLE-FLOAT 1) and that declaration is not checked."
  (declare (type (simple-array double-float 1) array1 array2)
           (optimize speed (safety 0)))
  ;; Main loop.  All variables of a DO are stepped in parallel, so every
  ;; accumulator step form below still sees the INDEX of the current
  ;; iteration while INDEX itself advances by 16.
  ;; NOTE(review): the THE form presumably exists so that the (+ index k)
  ;; offsets can be derived as in-range fixnums -- confirm before changing
  ;; the bound.
  (do ((index 0 (the (integer 0 #.(- array-total-size-limit 16)) (+ index 16)))
       (acc1 (f64.4 0) (f64.4+ acc1 (f64.4* (f64.4-row-major-aref array1 (+ index 0))
                                            (f64.4-row-major-aref array2 (+ index 0)))))
       (acc2 (f64.4 0) (f64.4+ acc2 (f64.4* (f64.4-row-major-aref array1 (+ index 4))
                                            (f64.4-row-major-aref array2 (+ index 4)))))
       (acc3 (f64.4 0) (f64.4+ acc3 (f64.4* (f64.4-row-major-aref array1 (+ index 8))
                                            (f64.4-row-major-aref array2 (+ index 8)))))
       (acc4 (f64.4 0) (f64.4+ acc4 (f64.4* (f64.4-row-major-aref array1 (+ index 12))
                                            (f64.4-row-major-aref array2 (+ index 12))))))
      ;; Stop only once fewer than 16 elements remain.  A block starting at
      ;; INDEX touches elements INDEX .. INDEX+15, which is in bounds exactly
      ;; when INDEX <= N - 16.  Testing with >= here would leave the last
      ;; complete block to the scalar loop below.
      ((> index (- n 16))
       ;; Tail: collapse the four accumulators into one double, then add the
       ;; remaining (fewer than 16) products one element at a time.  The
       ;; inner INDEX starts from the value the main loop stopped at.
       (do ((result (f64.4-horizontal+ (f64.4+ acc1 acc2 acc3 acc4))
                    (+ result (* (row-major-aref array1 index)
                                 (row-major-aref array2 index))))
            (index index (1+ index)))
           ((>= index n) result)))))
